Import and explore dataset

In [1]:
import os, base64, re, logging

from elasticsearch import Elasticsearch
from elasticsearch import helpers

import pandas as pd
import numpy as np

from datetime import datetime, timedelta

# sankey charts
import floweaver
import csv
from pprint import pprint

# map
import folium

# process issue and solution text
from nltk import RegexpParser, sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import download as nltk_download
from string import punctuation

# optional
#from functools import reduce
from itertools import chain

# gocode location
import requests

# api ip and passwords
from helpers import config
In [2]:
restart_df = pd.read_csv('./data/devices_2.csv', header=0, parse_dates = ['Date'], encoding = "utf-8")
In [3]:
restart_df.head()
Out[3]:
Product Category Brand Comments Repair Status Spare parts (needed/used) Event Group Date
0 Laptop medium Apple Needs new screen assembly Repairable No The Hong Kong University of Science and Techno... REMAKE 2019-03-31
1 Mobile Apple Cracked screen assembly Fixed No The Hong Kong University of Science and Techno... REMAKE 2019-03-31
2 Mobile Apple Cracjed rear case glass Repairable No The Hong Kong University of Science and Techno... REMAKE 2019-03-31
3 Mobile Samsung Screen blacked out after physical impact Repairable No The Hong Kong University of Science and Techno... REMAKE 2019-03-31
4 Mobile Samsung No display on the screen, light at the bottom ... Repairable No The Hong Kong University of Science and Techno... REMAKE 2019-03-31
In [4]:
Categories = restart_df[['Product Category','Brand']].groupby('Product Category').agg('count').reset_index()
Categories.columns = ['Product Category','Records_Count']
Categories
Out[4]:
Product Category Records_Count
0 Aircon/Dehumidifier 15
1 DLSR / Video Camera 25
2 Decorative or safety lights 16
3 Desktop computer 35
4 Digital Compact Camera 76
5 Fan 23
6 Flat screen 15-17" 6
7 Flat screen 19-20" 8
8 Flat screen 22-24" 9
9 Flat screen 26-30" 22
10 Flat screen 32-37" 23
11 Hair & Beauty item 70
12 Handheld entertainment device 132
13 Headphones 103
14 Hi-Fi integrated 86
15 Hi-Fi separates 164
16 Kettle 62
17 Lamp 58
18 Laptop large 55
19 Laptop medium 783
20 Laptop small 54
21 Misc 354
22 Mobile 852
23 Musical instrument 28
24 PC Accessory 86
25 Paper shredder 16
26 Portable radio 218
27 Power tool 89
28 Printer/scanner 128
29 Projector 4
30 Small kitchen item 291
31 TV and gaming-related accessories 110
32 Tablet 231
33 Toaster 91
34 Toy 31
35 Vacuum 145
In [5]:
Repair_Status = restart_df[['Repair Status','Brand']].groupby('Repair Status').agg('count').reset_index()
Repair_Status.columns = ['Repair Status','Records_Count']
Repair_Status
Out[5]:
Repair Status Records_Count
0 End of life 713
1 Fixed 2186
2 Repairable 1578
3 Unknown 22
In [6]:
Event_Location = restart_df[['Event','Brand']].groupby('Event').agg('count').reset_index()
Event_Location.columns = ['Event','Records_Count']
Event_Location[0:10]
Out[6]:
Event Records_Count
0 Chiesa Evangelica Metodista 7
1 Librarie La Passion du Libr' ere Tunis 0
2 Piazza Etrusca, 5, 50061 Compiobbi (FI) 0
3 Via Faentina, 43r, 50133 Firenze 0
4 Via di Montughi, 1, 50134 Firenze 0
5 #Futuremaker 5
6 33 Central Park West, New York, NY 10023 27
7 33 Central Park West, New York, NY 10025 70
8 3901 Fieldston Road Bronx, NY 36
9 3901 Fieldston Road, Bronx, NY 4
In [7]:
Spare_parts = restart_df[['Spare parts (needed/used)','Brand']].groupby('Spare parts (needed/used)').agg('count').reset_index()
Spare_parts.columns = ['Spare parts (needed/used)','Records_Count']
Spare_parts
Out[7]:
Spare parts (needed/used) Records_Count
0 No 4499
In [8]:
Restart_Group = restart_df[['Group','Brand']].groupby('Group').agg('count').reset_index()
Restart_Group.columns = ['Group','Records_Count']
Restart_Group.head()
Out[8]:
Group Records_Count
0 Bedpop Repair Shop 13
1 Brentford Recycling Action Group 2
2 Brixton Remakery 49
3 Cambridge Area Repair Cafes 11
4 Chesterfield Repair Cafe 45

Geocode location

In [9]:
locations = Restart_Group['Group'].values
locations[0:4]
Out[9]:
array(['Bedpop Repair Shop', 'Brentford Recycling Action Group ',
       'Brixton Remakery', 'Cambridge Area Repair Cafes'], dtype=object)
In [10]:
GOOGLE_MAPS_API_URL = config.GOOGLE_MAPS_API_URL 
api_key = config.api_key
In [11]:
geo_location = {}

for loc in locations:
    params = {
    'address': loc,
    'key' : api_key    
    }
    req = requests.get(GOOGLE_MAPS_API_URL, params=params)
    res = req.json()
    
    if res['status'] == 'ZERO_RESULTS':
        geodata = []
        # put unknown into atlantic ocean
        geodata.append(14.5994)
        geodata.append(28.6731)
 
 
    else:    
        result = res['results'][0]
        geodata = []
        geodata.append(result['geometry']['location']['lat'])
        geodata.append(result['geometry']['location']['lng'])

    
    geo_location[loc] = geodata
    
In [12]:
geo_location['Brixton Remakery']
Out[12]:
[51.47050970000001, -0.1033583]
In [13]:
# save to disc here to avoid having to call api all the time

Data Cleaning

In [14]:
del restart_df['Spare parts (needed/used)']
In [15]:
restart_df['Brand'] = restart_df['Brand'].replace(np.nan,'Unknown',regex=True)
restart_df['Comments'] = restart_df['Comments'].replace(np.nan,'None',regex=True)
restart_df['Event'] = restart_df['Event'].replace(np.nan,'Unknown',regex=True)
restart_df['Group'] = restart_df['Group'].replace(np.nan,'Unknown',regex=True)
In [16]:
restart_df['Location_Coordinates'] = restart_df['Group'].apply(lambda x: geo_location[x])
In [17]:
restart_df.columns
Out[17]:
Index(['Product Category', 'Brand', 'Comments', 'Repair Status', 'Event',
       'Group', 'Date', 'Location_Coordinates'],
      dtype='object')
In [18]:
restart_df.columns = ['Category', 'Brand', 'Comments', 'Repair_Status', 'Event_Location', 'Restart_Group', 'Event_Date', 'Location_Coordinates']
In [19]:
restart_df.head()
Out[19]:
Category Brand Comments Repair_Status Event_Location Restart_Group Event_Date Location_Coordinates
0 Laptop medium Apple Needs new screen assembly Repairable The Hong Kong University of Science and Techno... REMAKE 2019-03-31 [40.4480371, -80.0024407]
1 Mobile Apple Cracked screen assembly Fixed The Hong Kong University of Science and Techno... REMAKE 2019-03-31 [40.4480371, -80.0024407]
2 Mobile Apple Cracjed rear case glass Repairable The Hong Kong University of Science and Techno... REMAKE 2019-03-31 [40.4480371, -80.0024407]
3 Mobile Samsung Screen blacked out after physical impact Repairable The Hong Kong University of Science and Techno... REMAKE 2019-03-31 [40.4480371, -80.0024407]
4 Mobile Samsung No display on the screen, light at the bottom ... Repairable The Hong Kong University of Science and Techno... REMAKE 2019-03-31 [40.4480371, -80.0024407]

Sankey chart of product Brand and Category to repair group and repair status

In [20]:
# graph datasets

Category_Brand = restart_df[['Category','Brand','Event_Date']]\
        [ (restart_df['Category'] == 'Desktop computer') \
        | (restart_df['Category'] == 'Laptop large') \
        | (restart_df['Category'] == 'Laptop medium') \
        | (restart_df['Category'] == 'Laptop small') \
        | (restart_df['Category'] == 'Mobile') \
        | (restart_df['Category'] == 'Tablet') \
        ].groupby(['Category','Brand']).count().reset_index()
Category_Brand.columns=['source','target','value']
Category_Brand.head()
Out[20]:
source target value
0 Desktop computer ASUS 2
1 Desktop computer Ace 1
2 Desktop computer Acer 3
3 Desktop computer Apple 3
4 Desktop computer Dell 5
In [21]:
Category_Repair_Status = restart_df[['Category','Repair_Status','Event_Date']]\
    [ (restart_df['Category'] == 'Desktop computer') \
     | (restart_df['Category'] == 'Laptop large') \
     | (restart_df['Category'] == 'Laptop medium') \
     | (restart_df['Category'] == 'Laptop small') \
     | (restart_df['Category'] == 'Mobile') \
     | (restart_df['Category'] == 'Tablet') \
    ].groupby(['Category','Repair_Status']).count().reset_index()
Category_Repair_Status.columns=['source','target','value']
flows_df1 = Category_Repair_Status
flows_df1['type'] = flows_df1['source']
Category_Repair_Status.head()
Out[21]:
source target value type
0 Desktop computer End of life 33 Desktop computer
1 Desktop computer Fixed 107 Desktop computer
2 Desktop computer Repairable 57 Desktop computer
3 Desktop computer Unknown 1 Desktop computer
4 Laptop large End of life 6 Laptop large
In [22]:
Brand_Repair_Status = restart_df[['Brand','Repair_Status','Event_Date']]\
    [ (restart_df['Category'] == 'Desktop computer') \
     | (restart_df['Category'] == 'Laptop large') \
     | (restart_df['Category'] == 'Laptop medium') \
     | (restart_df['Category'] == 'Laptop small') \
     | (restart_df['Category'] == 'Mobile') \
     | (restart_df['Category'] == 'Tablet') \
    ].groupby(['Brand','Repair_Status']).count().reset_index()
Brand_Repair_Status.columns=['source','target','value']
flows_df2 = Brand_Repair_Status
Brand_Repair_Status.head()
Out[22]:
source target value
0 ASUS End of life 6
1 ASUS Fixed 40
2 ASUS Repairable 35
3 ASUS Unknown 1
4 Ace Fixed 1
In [23]:
flows_df3 = pd.concat([Category_Brand, Brand_Repair_Status])
In [24]:
flows_df3.head()
Out[24]:
source target value
0 Desktop computer ASUS 2
1 Desktop computer Ace 1
2 Desktop computer Acer 3
3 Desktop computer Apple 3
4 Desktop computer Dell 5
In [25]:
#https://sankeyview.readthedocs.io/en/latest/tutorials/quickstart.html

# Set the default size to fit the documentation better.
size = dict(width=800, height=420)

nodes = {
    'product_categories': floweaver.ProcessGroup(['Desktop computer', 'Laptop large', 'Laptop medium', 'Laptop small', 'Mobile', 'Tablet']),
    'repair_status': floweaver.ProcessGroup(['End of life', 'Fixed', 'Repairable', 'Unknown']),
}

ordering = [
    ['product_categories'],       # put "farms" on the left...
    ['repair_status'],   # ... and "customers" on the right.
]

bundles = [floweaver.Bundle('product_categories', 'repair_status')]

# category and type keep same
products_by_Category = floweaver.Partition.Simple('type', ['Desktop computer', 'Laptop large', 'Laptop medium', 'Laptop small', 'Mobile', 'Tablet'])

# Set the colours for the labels in the partition.
palette = {'Laptop medium': 'yellowgreen', 'Laptop large': 'green', 'Laptop small': 'gold', 'Tablet': 'orange', 'Mobile': 'red', 'Desktop computer': 'blue'}

Categories_partition = floweaver.Partition.Simple('process', [
'Desktop computer', 'Laptop large', 'Laptop medium', 'Laptop small', 'Mobile', 'Tablet'
])

# This is another partition.
Repair_status_partition = floweaver.Partition.Simple('process', [
    'End of life', 'Fixed', 'Repairable', 'Unknown'
])

# Update the ProcessGroup nodes to use the partitions
nodes['product_categories'].partition = Categories_partition
nodes['repair_status'].partition = Repair_status_partition
In [26]:
sdd = floweaver.SankeyDefinition(nodes, bundles, ordering,
                       flow_partition=products_by_Category)

floweaver.weave(sdd, flows_df1, palette=palette).to_widget(**size).auto_save_png('categiries_repair_status2.png')

Map out locations of repairs using geocoded location based on Restart Group name

In [27]:
from folium.plugins import MarkerCluster

restart_map = folium.Map(location=[51.5, -0.1], zoom_start=11)

marker_cluster = MarkerCluster().add_to(restart_map)

for ix, point in enumerate(restart_df['Location_Coordinates'].values):
    folium.Marker(point
                  , popup = 'Restart Group: ' + restart_df['Restart_Group'].iloc[ix] + "\n Location: " + restart_df['Event_Location'].iloc[ix]
                 ).add_to(marker_cluster)
    
In [28]:
restart_map
Out[28]:
In [83]:
restart_map.save(outfile="./viz/restart_map.html")

Keywords by Category

In [27]:
# download corpora into venv. Alternative is to save it to project folder and add it to path like this :
# nltk.data.path.append('/home/xxx/xxx/restart')
#nltk_download('punkt', download_dir='/home/elena/anaconda3/envs/restart/nltk_data')
In [28]:
#nltk_download('averaged_perceptron_tagger', download_dir='/home/elena/anaconda3/envs/restart/nltk_data')
In [83]:
#nltk_download('stopwords', download_dir='/home/elena/anaconda3/envs/restart/nltk_data')
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/elena/anaconda3/envs/restart/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Out[83]:
True
In [11]:
#nltk_download('wordnet', download_dir='/home/elena/anaconda3/envs/restart/nltk_data')
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/elena/anaconda3/envs/restart/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Out[11]:
True
In [29]:
sample_text = """Audio volume poor; New speaker needed. Loose Connection, Battery damage. Won't start, needs new screen. Needs screen replacement"""
In [30]:
# https://nbviewer.jupyter.org/github/lukewrites/NP_chunking_with_nltk/blob/master/NP_chunking_with_the_NLTK.ipynb


# extract nouns and verbs, plus noun and verb phrases from comments to get keywords
# this is instead of tf-idf, because these texts are short and have nothing other than a list of issues and solutions

patterns = """
    NP:    {<DT><WP><VBP>*<RB>*<VBN><IN><NN>}
           {<JJ>*<NN|NNS|NNP|NNPS><CC>*<NN|NNS|NNP|NNPS>+}
           {<JJ>*<NN|NNS|NNP|NNPS>+}

          """ 
    
    
    #            {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}
    #           {<NN|NNS|NNP|NNPS><IN>*<NN|NNS|NNP|NNPS>+}
    
    
NPChunker = RegexpParser(patterns)
lemmatizer = WordNetLemmatizer()

extra_stopwords = ["None","did", "didn't", "do", "will", "won't", "wouldn't", "Did", "Didn't", "Don't", "Doesn't",
                   "Will", "Won't", "Wouldn't", "Wo", "wo", "Do", "Di", "ok", "OK", "n't", "needs", "need", "doesn't", 
                   "Can't", "can't", "cannot", "Cannot", "Does", "does", "needs", "Needs", "Need", "need", "n't", 'Ca',
                  "%", "+", "<", ">", "*","’","%","/","'",'"',"]","[",")","(","'",""]
include_words = ["start", "Audio", "audio", "switch", "battery", "not"]


# TODO remove all punctuation from a word apart from mid word hyphen

def prepare_text(contents):
    
    # Tokenize the text into sentences.
    tokenized_sentences = sent_tokenize(contents)  
    
    # Tokenize words in sentences. Unfortunately, it splits these 'Does', "n't", 'Ca', "n't", so removing using stopwords 
    tokenized_words = [word_tokenize(sentence) for sentence in tokenized_sentences]  
    
    #tokenized_words_lower = [ [sentence[0].lower()] + sentence[1:]  for sentence in tokenized_words ]
    
    tagged_words = [pos_tag([ word.replace('-','').replace("'",'').replace(".",'') for word in
                        [ word for word in chunk if word not in stopwords.words('english')+extra_stopwords ]
                        if len(word.replace('-','').replace("'",'').replace(".",''))>0
                        ]) \
                    for chunk in tokenized_words ]  # Tag words for POS in each sentence.
    word_tree = [NPChunker.parse(word) for word in tagged_words]  # Identify NP chunks
    
    return word_tree  # Return the tagged & chunked sentences.


def return_a_list_of_NPs(comment):
    
    if (comment == '') | (comment == 'None'):
        return []
    
    sentences = prepare_text(comment) 
    
    nps = []  # an empty list in which to NPs will be stored.
    for sent in sentences:
        tree = NPChunker.parse(sent)
        for subtree in tree.subtrees():

            if (subtree.label() == 'NP' ) | (( (subtree.label() == 'NN') | (subtree.label() == 'VB')) & (len([word for word, tag in subtree.leaves() if word in include_words]) >=1) ):
                t = subtree
                if len([word for word, tag in t.leaves()]) == 1:
                    
                    t = lemmatizer.lemmatize(t.leaves()[0][0])
                else:
                    t = ' '.join(word for word, tag in t.leaves())
                nps.append(t)
    return nps


return_a_list_of_NPs(sample_text )
Out[30]:
['Audio volume',
 'New speaker',
 'Loose Connection',
 'Battery damage',
 'start',
 'new screen',
 'screen replacement']
In [31]:
# process dataframe
restart_df['Keywords'] = restart_df['Comments'].apply(lambda x: return_a_list_of_NPs(x))
Warning: parsing empty text
Warning: parsing empty text
Warning: parsing empty text
Warning: parsing empty text
Warning: parsing empty text
Warning: parsing empty text
Warning: parsing empty text
Warning: parsing empty text
Warning: parsing empty text
Warning: parsing empty text
Warning: parsing empty text
Warning: parsing empty text
In [32]:
restart_df[['Comments','Keywords']][restart_df['Comments'] != 'None'].head()
Out[32]:
Comments Keywords
0 Needs new screen assembly [new screen assembly]
1 Cracked screen assembly [screen assembly]
2 Cracjed rear case glass [Cracjed, rear case glass]
3 Screen blacked out after physical impact [Screen, physical impact]
4 No display on the screen, light at the bottom ... [display screen, light bottom research issue]
In [33]:
keywords = restart_df['Keywords'].values
In [34]:
tag_list = list(chain(*restart_df['Keywords'].values.flat))

#tag_counts = [(tag, tag_list.count(tag)) for tag in np.unique(np.array(tag_list)) if ((tag_list.count(tag)>1) | (tag in include_words ))]
In [35]:
tag, counts = np.unique(np.array(tag_list),return_counts=True)
#for item in np.asarray((tag, counts)).T:

# all tags
tags_with_counts_all = ([ [tag, tag.lower(), count] for tag, count in zip(tag, counts) ])

# lowered tags only
tag, counts = np.unique(np.array([tag.lower() for tag in tag_list]),return_counts=True)
tags_with_counts_lower = ([ [tag, count] for tag, count in zip(tag, counts) ])
In [36]:
df_lower = pd.DataFrame(tags_with_counts_lower, columns = ['lower', 'count_lower'])
df_all = pd.DataFrame(tags_with_counts_all, columns = ['tag','lower','count_all'])
In [37]:
df_merged = pd.merge(df_all, df_lower, how = 'left', left_on = 'lower', right_on = 'lower')
In [38]:
#df_merged.sort_values(by=['lower','count_all'], ascending = [True, False])[df_merged.count_lower>1]
In [39]:
df_merged['count_instnce'] = df_merged.sort_values(by=['lower','count_all'], ascending = [True,False])\
                                        .groupby(['lower'])\
                                        .cumcount() + 1

df_merged.sort_values(by=['lower','count_all'], ascending = ['True','False'])[df_merged.count_lower>1].head(20)
/home/elena/anaconda3/envs/restart/lib/python3.5/site-packages/ipykernel_launcher.py:5: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """
Out[39]:
tag lower count_all count_lower count_instnce
3860 able fix able fix 3 3 1
3873 access access 2 2 1
3883 access repair access repair 2 2 1
3888 activity activity 6 6 1
3890 adapter adapter 2 2 1
18 Added RAM added ram 15 15 1
3898 adhesive remover adhesive remover 2 2 1
26 Adv adv 4 4 1
3904 advice advice 66 143 2
32 Advice advice 77 143 1
3905 advice advice advice advice 2 2 1
33 Advice Given advice given 2 2 1
3910 advies advies 1 4 2
38 Advies advies 3 4 1
40 Advised get advised get 3 3 1
50 Aerial Broken aerial broken 1 2 1
51 Aerial broken aerial broken 1 2 2
3923 age age 4 4 1
53 Air air 1 11 2
3925 air air 10 11 1
In [40]:
[(k,v) for v, k in df_merged[['tag','lower']].values][0:10]
Out[40]:
[('//helpubuntucom/community/synaptic/packagedownloadscript',
  '//helpubuntucom/community/Synaptic/PackageDownloadScript'),
 ('//maglitecom/support/faq', '//maglitecom/support/faq'),
 ('== clean connectors', '== clean connectors'),
 ('a3 printer', 'A3 printer'),
 ('aa', 'AA'),
 ('am band', 'AM band'),
 ('am/fm radio', 'AM/FM radio'),
 ('amp speaker fuses', 'AMP speaker fuses'),
 ('asus connect network', 'ASUS connect network'),
 ('av receiver dan', 'AV receiver Dan')]
In [41]:
df_merged = df_merged[(df_merged.count_instnce == 1) & (df_merged.count_all >1)]
In [42]:
popular_tags2 = ([tag for tag, count in zip(tag, counts) if count>1]) #1465 vs 7k
len(popular_tags2), popular_tags2[0:20]
Out[42]:
(1353,
 ['able fix',
  'access',
  'access repair',
  'activity',
  'adapter',
  'added ram',
  'adhesive remover',
  'adv',
  'advice',
  'advice advice',
  'advice given',
  'advies',
  'advised get',
  'aerial broken',
  'age',
  'air',
  'air fryer',
  'alarm',
  'alarm clock',
  'alarm clock radio'])
In [43]:
popular_tags = df_merged.tag.values
In [44]:
# need to lemmatise (Appeared, Appears) and possibly get rid of standalone verbs
popular_tags[0:25], len(popular_tags)
Out[44]:
(array(['Added RAM', 'Adv', 'Advice', 'Advice Given', 'Advies',
        'Advised get', 'Alarm clock', 'Amp', 'Amplifier', 'Android',
        'Antenna', 'Antique', 'Appeared', 'Appears', 'Apple',
        'Apple store', 'Audio component', 'Audio component No power',
        'Audio component No sound output', 'Audio device',
        'Audio equipment', 'Audio recorder', 'B200 error', 'Bad battery',
        'Bad cable'], dtype=object), 1221)
In [45]:
# create lookup
common_tag_lookup = dict( [(k,v) for v, k in df_merged[(df_merged.count_instnce == 1) & (df_merged.count_all >1)][['tag','lower']].values])
In [46]:
common_tag_lookup['dab']
Out[46]:
'DAB'
In [47]:
# create column with common form of only popular keywords
restart_df['keywords_popular'] = restart_df['Keywords'].apply(lambda x: [common_tag_lookup[keyword] for keyword in x if keyword in common_tag_lookup])
In [48]:
del restart_df['Keywords']
In [49]:
restart_df = restart_df.rename(columns={'keywords_popular': 'Keywords'})
In [50]:
restart_df.columns
Out[50]:
Index(['Category', 'Brand', 'Comments', 'Repair_Status', 'Event_Location',
       'Restart_Group', 'Event_Date', 'Location_Coordinates', 'Keywords'],
      dtype='object')
In [51]:
# Doesn't work maybe need to times 1000 or something

def totimestamp(dt, epoch=datetime(1970,1,1)):
    #dtarr = dtstr.split('-')
    #dt = datetime(int(dtarr[0]), int(dtarr[1]), int(dtarr[2]))
    td = dt - epoch
    return int(1000*(td.microseconds + (td.seconds + td.days * 86400) * 10**6) / 10**6) 

now = datetime.utcnow()
print (now)
print (totimestamp(now))

#totimestamp('2019-03-31')
2019-08-28 16:56:49.975155
1567011409975
In [52]:
restart_df['unix_timestamp'] = restart_df['Event_Date'].apply(lambda x : totimestamp(x))
In [53]:
restart_df.head()
Out[53]:
Category Brand Comments Repair_Status Event_Location Restart_Group Event_Date Location_Coordinates Keywords unix_timestamp
0 Laptop medium Apple Needs new screen assembly Repairable The Hong Kong University of Science and Techno... REMAKE 2019-03-31 [40.4480371, -80.0024407] [new screen assembly] 1553990400000
1 Mobile Apple Cracked screen assembly Fixed The Hong Kong University of Science and Techno... REMAKE 2019-03-31 [40.4480371, -80.0024407] [screen assembly] 1553990400000
2 Mobile Apple Cracjed rear case glass Repairable The Hong Kong University of Science and Techno... REMAKE 2019-03-31 [40.4480371, -80.0024407] [] 1553990400000
3 Mobile Samsung Screen blacked out after physical impact Repairable The Hong Kong University of Science and Techno... REMAKE 2019-03-31 [40.4480371, -80.0024407] [] 1553990400000
4 Mobile Samsung No display on the screen, light at the bottom ... Repairable The Hong Kong University of Science and Techno... REMAKE 2019-03-31 [40.4480371, -80.0024407] [] 1553990400000
In [56]:
restart_df.reset_index(inplace = True)
In [57]:
restart_df.columns
Out[57]:
Index(['index', 'Category', 'Brand', 'Comments', 'Repair_Status',
       'Event_Location', 'Restart_Group', 'Event_Date', 'Location_Coordinates',
       'Keywords', 'unix_timestamp'],
      dtype='object')
In [68]:
#restart_df['Location_Coordinates'] = restart_df['Location_Coordinates'].apply(lambda x: str(x.split(' ')[0])+', '+str(x.split(' ')[1]))
In [69]:
restart_df.head()
Out[69]:
index Category Brand Comments Repair_Status Event_Location Restart_Group Event_Date Location_Coordinates Keywords unix_timestamp
0 0 Laptop medium Apple Needs new screen assembly Repairable The Hong Kong University of Science and Techno... REMAKE 2019-03-31 40.4480371, -80.0024407 [new screen assembly] 1553990400000
1 1 Mobile Apple Cracked screen assembly Fixed The Hong Kong University of Science and Techno... REMAKE 2019-03-31 40.4480371, -80.0024407 [screen assembly] 1553990400000
2 2 Mobile Apple Cracjed rear case glass Repairable The Hong Kong University of Science and Techno... REMAKE 2019-03-31 40.4480371, -80.0024407 [] 1553990400000
3 3 Mobile Samsung Screen blacked out after physical impact Repairable The Hong Kong University of Science and Techno... REMAKE 2019-03-31 40.4480371, -80.0024407 [] 1553990400000
4 4 Mobile Samsung No display on the screen, light at the bottom ... Repairable The Hong Kong University of Science and Techno... REMAKE 2019-03-31 40.4480371, -80.0024407 [] 1553990400000
In [14]:
# split noun phrases? e.g. Broken Screen and Screen
# ensure 'not' is part of noun phrase e.g. screen not working

Elastic Load

In [58]:
logging.basicConfig(level=logging.INFO)

# url and credentials go into service account's .bashrc or profile in it's home directory
os.environ['BONSAI_URL'] = config.bonsai_url
port = config.bonsai_port
bonsai = os.environ['BONSAI_URL']

auth = re.search('https\:\/\/(.*)\@', bonsai).group(1).split(':')

host = bonsai.replace('https://{}:{}@'.format(auth[0], auth[1]), '')

es_header = [{
    'host': host,
    'port' : port,
    'use_ssl': True,
    'http_auth': (auth[0], auth[1])
}]

es = Elasticsearch(es_header)

es.ping()
INFO:elasticsearch:HEAD https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/ [status:200 request:0.433s]
Out[58]:
True
In [59]:
es.indices.delete(index='restart_index', ignore=[400, 404])
INFO:elasticsearch:DELETE https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/restart_index [status:200 request:0.131s]
Out[59]:
{'acknowledged': True}
In [70]:
es.indices.get_alias("*")
INFO:elasticsearch:GET https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/*/_alias [status:200 request:0.446s]
Out[70]:
{'restart_index': {'aliases': {}}}
In [8]:
#es.indices.delete(index='.kibana_1', ignore=[400, 404])
INFO:elasticsearch:DELETE https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/.kibana_1 [status:200 request:0.204s]
Out[8]:
{'acknowledged': True}
In [61]:
request_body = {
    "settings" : {
        "number_of_shards": 3,
        "number_of_replicas": 1
    },

    'mappings': {
            'properties': {
                'Category': {'type': 'keyword',
                            'ignore_above': 256,
                             'eager_global_ordinals' : 'true'
                            },
                'Brand': {'type': 'keyword',
                            'ignore_above': 256,
                             'eager_global_ordinals' : 'true'
                            },
                'Comments': {'type': 'text',
                             'index_phrases' : 'true'
                            },    
                'Repair_Status': {'type': 'keyword',
                            'ignore_above': 256,
                             'eager_global_ordinals' : 'true'
                            },
                'Event_Location' : {'type': 'keyword',
                             'eager_global_ordinals' : 'true'
                            },
                'Restart_Group' : {'type': 'keyword',
                             'eager_global_ordinals' : 'true'
                            },
                'Event_Date' : {'type' : 'date'
                            },
                "Location_Coordinates": {"type": "geo_point",
                                         "null_value" : "14.5994, 28.6731",
                                         "ignore_malformed" : "true"
                            },
                'Keywords' : {'type': 'keyword',
                             'eager_global_ordinals' : 'true'
                            },
                'Unix_Timestamp' : {'type': 'date'                    
                            }
          
            }}
}


es.indices.create(index = 'restart_index', body = request_body)
INFO:elasticsearch:PUT https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/restart_index [status:200 request:0.168s]
Out[61]:
{'acknowledged': True, 'index': 'restart_index', 'shards_acknowledged': True}
In [62]:
es.indices.exists(index='restart_index')
INFO:elasticsearch:HEAD https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/restart_index [status:200 request:0.051s]
Out[62]:
True
In [73]:
#https://stackoverflow.com/questions/20288770/how-to-use-bulk-api-to-store-the-keywords-in-es-by-using-python

bulk_data = []

for index, row in restart_df.iterrows():
    data_dict = {}
    
    for i in range(len(row)):
        
        data_dict[restart_df.columns[i]] = row[i]        
 
        doc_dict = {
                "_index": 'restart_index',
                "_id": data_dict['index'],
                "_source": data_dict
        }
    
    bulk_data.append(doc_dict)
    #bulk_data.append(data_dict)
In [74]:
res = helpers.bulk(client = es, actions = bulk_data)
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.980s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.401s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.346s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.387s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.311s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.335s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.583s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.938s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.726s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.761s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.677s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.721s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.574s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.636s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.587s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.586s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.575s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.489s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.370s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.500s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.462s]
INFO:elasticsearch:POST https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/_bulk [status:200 request:0.364s]
In [80]:
# check data is in there, and structure in there
es.search(body={"query": {"match": {"Event_Location": "Chiesa Evangelica Metodista"}}}, index = 'restart_index')
#es.indices.get_mapping(index = 'restart_index')
INFO:elasticsearch:GET https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/restart_index/_search [status:200 request:0.834s]
Out[80]:
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 3, 'total': 3},
 'hits': {'hits': [],
  'max_score': None,
  'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 542}
In [78]:
doc = {
    'size' : 10000,
    'query': {
        'match_all' : {}
    }
}

res2 = es.search(index = 'restart_index', body=doc)
print("{} documents found".format(res2['hits']['total']))
for doc in res2['hits']['hits']:
    print("%s) %s" % (doc['_id'], doc['_source']))
WARNING:elasticsearch:GET https://restart-1-6927513437.eu-west-1.bonsaisearch.net:443/restart_index/_search [status:N/A request:27.948s]
Traceback (most recent call last):
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/response.py", line 360, in _error_catcher
    yield
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/response.py", line 438, in read
    data = self._fp.read()
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/http/client.py", line 461, in read
    s = self._safe_read(self.length)
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/http/client.py", line 607, in _safe_read
    chunk = self.fp.read(min(amt, MAXAMOUNT))
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/socket.py", line 576, in readinto
    return self._sock.recv_into(b)
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/ssl.py", line 937, in recv_into
    return self.read(nbytes, buffer)
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/ssl.py", line 799, in read
    return self._sslobj.read(len, buffer)
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/ssl.py", line 583, in read
    v = self._sslobj.read(len, buffer)
socket.timeout: The read operation timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/site-packages/elasticsearch/connection/http_urllib3.py", line 172, in perform_request
    response = self.pool.urlopen(method, url, body, retries=Retry(False), headers=request_headers, **kw)
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/connectionpool.py", line 638, in urlopen
    _stacktrace=sys.exc_info()[2])
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/util/retry.py", line 343, in increment
    raise six.reraise(type(error), error, _stacktrace)
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/packages/six.py", line 686, in reraise
    raise value
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/connectionpool.py", line 616, in urlopen
    **response_kw)
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/response.py", line 525, in from_httplib
    **response_kw)
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/response.py", line 209, in __init__
    self._body = self.read(decode_content=decode_content)
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/response.py", line 459, in read
    raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/elena/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/response.py", line 365, in _error_catcher
    raise ReadTimeoutError(self._pool, None, 'Read timed out.')
urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='restart-1-6927513437.eu-west-1.bonsaisearch.net', port=443): Read timed out.
---------------------------------------------------------------------------
timeout                                   Traceback (most recent call last)
~/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/response.py in _error_catcher(self)
    359             try:
--> 360                 yield
    361 

~/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
    437                 # cStringIO doesn't like amt=None
--> 438                 data = self._fp.read()
    439                 flush_decoder = True

~/anaconda3/envs/restart/lib/python3.5/http/client.py in read(self, amt)
    460                 try:
--> 461                     s = self._safe_read(self.length)
    462                 except IncompleteRead:

~/anaconda3/envs/restart/lib/python3.5/http/client.py in _safe_read(self, amt)
    606         while amt > 0:
--> 607             chunk = self.fp.read(min(amt, MAXAMOUNT))
    608             if not chunk:

~/anaconda3/envs/restart/lib/python3.5/socket.py in readinto(self, b)
    575             try:
--> 576                 return self._sock.recv_into(b)
    577             except timeout:

~/anaconda3/envs/restart/lib/python3.5/ssl.py in recv_into(self, buffer, nbytes, flags)
    936                   self.__class__)
--> 937             return self.read(nbytes, buffer)
    938         else:

~/anaconda3/envs/restart/lib/python3.5/ssl.py in read(self, len, buffer)
    798         try:
--> 799             return self._sslobj.read(len, buffer)
    800         except SSLError as x:

~/anaconda3/envs/restart/lib/python3.5/ssl.py in read(self, len, buffer)
    582         if buffer is not None:
--> 583             v = self._sslobj.read(len, buffer)
    584         else:

timeout: The read operation timed out

During handling of the above exception, another exception occurred:

ReadTimeoutError                          Traceback (most recent call last)
~/anaconda3/envs/restart/lib/python3.5/site-packages/elasticsearch/connection/http_urllib3.py in perform_request(self, method, url, params, body, timeout, ignore, headers)
    171 
--> 172             response = self.pool.urlopen(method, url, body, retries=Retry(False), headers=request_headers, **kw)
    173             duration = time.time() - start

~/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    637             retries = retries.increment(method, url, error=e, _pool=self,
--> 638                                         _stacktrace=sys.exc_info()[2])
    639             retries.sleep()

~/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    342             # Disabled, indicate to re-raise the error.
--> 343             raise six.reraise(type(error), error, _stacktrace)
    344 

~/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/packages/six.py in reraise(tp, value, tb)
    685             raise value.with_traceback(tb)
--> 686         raise value
    687 

~/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    615                                                      retries=retries,
--> 616                                                      **response_kw)
    617 

~/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/response.py in from_httplib(ResponseCls, r, **response_kw)
    524                            original_response=r,
--> 525                            **response_kw)
    526         return resp

~/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/response.py in __init__(self, body, headers, status, version, reason, strict, preload_content, decode_content, original_response, pool, connection, msg, retries, enforce_content_length, request_method, request_url)
    208         if preload_content and not self._body:
--> 209             self._body = self.read(decode_content=decode_content)
    210 

~/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
    458                         # Content-Length are caught.
--> 459                         raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
    460 

~/anaconda3/envs/restart/lib/python3.5/contextlib.py in __exit__(self, type, value, traceback)
     76             try:
---> 77                 self.gen.throw(type, value, traceback)
     78             except StopIteration as exc:

~/anaconda3/envs/restart/lib/python3.5/site-packages/urllib3/response.py in _error_catcher(self)
    364                 # there is yet no clean way to get at it from this context.
--> 365                 raise ReadTimeoutError(self._pool, None, 'Read timed out.')
    366 

ReadTimeoutError: HTTPSConnectionPool(host='restart-1-6927513437.eu-west-1.bonsaisearch.net', port=443): Read timed out.

During handling of the above exception, another exception occurred:

ConnectionTimeout                         Traceback (most recent call last)
<ipython-input-78-47a40abf95f3> in <module>
      6 }
      7 
----> 8 res2 = es.search(index = 'restart_index', body=doc)
      9 print("{} documents found".format(res2['hits']['total']))
     10 for doc in res2['hits']['hits']:

~/anaconda3/envs/restart/lib/python3.5/site-packages/elasticsearch/client/utils.py in _wrapped(*args, **kwargs)
     74                 if p in kwargs:
     75                     params[p] = kwargs.pop(p)
---> 76             return func(*args, params=params, **kwargs)
     77         return _wrapped
     78     return _wrapper

~/anaconda3/envs/restart/lib/python3.5/site-packages/elasticsearch/client/__init__.py in search(self, index, doc_type, body, params)
    658             index = '_all'
    659         return self.transport.perform_request('GET', _make_path(index,
--> 660             doc_type, '_search'), params=params, body=body)
    661 
    662     @query_params('_source', '_source_exclude', '_source_include',

~/anaconda3/envs/restart/lib/python3.5/site-packages/elasticsearch/transport.py in perform_request(self, method, url, headers, params, body)
    316                 delay = 2**attempt - 1
    317                 time.sleep(delay)
--> 318                 status, headers_response, data = connection.perform_request(method, url, params, body, headers=headers, ignore=ignore, timeout=timeout)
    319 
    320             except TransportError as e:

~/anaconda3/envs/restart/lib/python3.5/site-packages/elasticsearch/connection/http_urllib3.py in perform_request(self, method, url, params, body, timeout, ignore, headers)
    178                 raise SSLError('N/A', str(e), e)
    179             if isinstance(e, ReadTimeoutError):
--> 180                 raise ConnectionTimeout('TIMEOUT', str(e), e)
    181             raise ConnectionError('N/A', str(e), e)
    182 

ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPSConnectionPool(host='restart-1-6927513437.eu-west-1.bonsaisearch.net', port=443): Read timed out.)
In [ ]:
 
In [ ]:
# es load
# carts:
#    barchart % fixed by brand
#    keywords cloud
#    table top keyword by product category
#    timeline overall count
#    map count by location
#    table with whole comments, product category

# upload notebook, folium map and sankey chart to github
# share with Restart guy
# share with beautiful people guy, mention hive jars